#-*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import lxml
import requests
import xlsxwriter


class Spider():
    """Scrape riddle/answer pairs from a list of pages and save them to Excel.

    Pages are expected to be GB2312-encoded and to hold the data in an
    alternating sequence of <td> cells: riddle, answer, riddle, answer, ...
    Column 0 of the worksheet receives the riddles, column 1 the answers.
    """

    # Initialize parameters
    def __init__(self, urlList):
        self.urlList = urlList   # pages to scrape, processed in order
        self.num = 0             # unused; kept for backward compatibility
        self.row = 1             # current worksheet row (header occupies row 1)
        self.col = 0             # first header column
        self.wookbook = None     # xlsxwriter Workbook, created in main() (sic: original name kept)
        self.wooksheet = None    # worksheet handle, created in main()
        self.id = 0              # running count of cells written, for progress output

    # Download one page and hand the HTML to the parser
    def downLoad(self, add_url):
        print('开始下载html')
        # timeout so an unresponsive server cannot hang the whole run;
        # raise_for_status so we never try to parse an HTTP error page
        res = requests.get(add_url, timeout=30)
        res.raise_for_status()
        res.encoding = 'gb2312'  # the site serves GB2312-encoded Chinese text
        self.analy(res.text)

    # Parse the HTML and write riddle/answer cells into the worksheet
    def analy(self, html):
        print('开始解析html')
        soup = BeautifulSoup(html, 'html5lib')
        td_list = soup.select('#__01 tbody div div table tbody tr td')
        for index, title in enumerate(td_list):
            if index % 2 == 0:
                # even cell: a riddle — start a new row, column 0
                self.row += 1
                self.wooksheet.write(self.row, 0, title.get_text())
            else:
                # odd cell: the answer to the preceding riddle — column 1
                self.wooksheet.write(self.row, 1, title.get_text())
            self.id += 1
            print(f'已完成{self.id}')

    # Main driver: create the workbook, scrape every URL, then close.
    # output_path is new but defaulted, so existing callers are unaffected.
    def main(self, output_path='/home/wwwroot/default/python/cy.xlsx'):
        self.wookbook = xlsxwriter.Workbook(output_path)
        self.wooksheet = self.wookbook.add_worksheet()
        self.wooksheet.write(self.row, self.col, '谜题')
        self.wooksheet.write(self.row, self.col + 1, '答案')
        try:
            for url in self.urlList:
                self.downLoad(url)
        finally:
            # always close so a partially-filled workbook is still written out
            self.wookbook.close()


# 入口函数
# Script entry point
if __name__ == "__main__":
    target_pages = ["http://www.hydcd.com/baike/cymy3.htm"]
    Spider(target_pages).main()